414333 - Özgür Polat
417121 - Hüseyin Can Minareci
With more and more consumers abandoning their credit card programs, a manager at the bank is concerned. They would really appreciate if one could foresee who would be churned for them so that they can proactively go to the consumer and offer better value to them and turn the decisions of customers in the opposite direction. Thus, they aggregated this dataset and the source we acquired it received it from here
Here in this project we will try to enlight the big picture a bit more with the capabilities we gained thanks to the Advanced Visualization in R course in Faculty of Economical Sciences at the University of Warsaw.
It is the best to start with understanding the variables we have and their definitions.
knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)
library(gridExtra)
library(cowplot)
library(ggforce)
library(GGally)
library(ggpubr)
churn <- read.csv("Data/churn.csv", sep = ',', stringsAsFactors = TRUE)
churnN <- read.csv("Data/churn.csv", sep = ',', na.strings = c("NA", "N/A", "Unknown"), stringsAsFactors = TRUE)
colSums(churn[,c(6,7,8)]=="Unknown")
## Education_Level Marital_Status Income_Category
## 1519 749 1112
## I think we should remove those Unknowns
churnNN <- drop_na(churnN)
## Drop ClientNum column here
## Make Attrition Flag 1-0
prop.table(table(churn$Attrition_Flag))
##
## Attrited Customer Existing Customer
## 0.1606596 0.8393404
prop.table(table(churnNN$Attrition_Flag))
##
## Attrited Customer Existing Customer
## 0.1571812 0.8428188
After dropping Unknowns we are having very similar distribution and I would say lets drop it in order to have better EDA
summary(churn)
## CLIENTNUM Attrition_Flag Customer_Age Gender
## Min. :708082083 Attrited Customer:1627 Min. :26.00 F:5358
## 1st Qu.:713036770 Existing Customer:8500 1st Qu.:41.00 M:4769
## Median :717926358 Median :46.00
## Mean :739177606 Mean :46.33
## 3rd Qu.:773143533 3rd Qu.:52.00
## Max. :828343083 Max. :73.00
##
## Dependent_count Education_Level Marital_Status Income_Category
## Min. :0.000 College :1013 Divorced: 748 $120K + : 727
## 1st Qu.:1.000 Doctorate : 451 Married :4687 $40K - $60K :1790
## Median :2.000 Graduate :3128 Single :3943 $60K - $80K :1402
## Mean :2.346 High School :2013 Unknown : 749 $80K - $120K :1535
## 3rd Qu.:3.000 Post-Graduate: 516 Less than $40K:3561
## Max. :5.000 Uneducated :1487 Unknown :1112
## Unknown :1519
## Card_Category Months_on_book Total_Relationship_Count
## Blue :9436 Min. :13.00 Min. :1.000
## Gold : 116 1st Qu.:31.00 1st Qu.:3.000
## Platinum: 20 Median :36.00 Median :4.000
## Silver : 555 Mean :35.93 Mean :3.813
## 3rd Qu.:40.00 3rd Qu.:5.000
## Max. :56.00 Max. :6.000
##
## Months_Inactive_12_mon Contacts_Count_12_mon Credit_Limit
## Min. :0.000 Min. :0.000 Min. : 1438
## 1st Qu.:2.000 1st Qu.:2.000 1st Qu.: 2555
## Median :2.000 Median :2.000 Median : 4549
## Mean :2.341 Mean :2.455 Mean : 8632
## 3rd Qu.:3.000 3rd Qu.:3.000 3rd Qu.:11068
## Max. :6.000 Max. :6.000 Max. :34516
##
## Total_Revolving_Bal Avg_Open_To_Buy Total_Amt_Chng_Q4_Q1 Total_Trans_Amt
## Min. : 0 Min. : 3 Min. :0.0000 Min. : 510
## 1st Qu.: 359 1st Qu.: 1324 1st Qu.:0.6310 1st Qu.: 2156
## Median :1276 Median : 3474 Median :0.7360 Median : 3899
## Mean :1163 Mean : 7469 Mean :0.7599 Mean : 4404
## 3rd Qu.:1784 3rd Qu.: 9859 3rd Qu.:0.8590 3rd Qu.: 4741
## Max. :2517 Max. :34516 Max. :3.3970 Max. :18484
##
## Total_Trans_Ct Total_Ct_Chng_Q4_Q1 Avg_Utilization_Ratio
## Min. : 10.00 Min. :0.0000 Min. :0.0000
## 1st Qu.: 45.00 1st Qu.:0.5820 1st Qu.:0.0230
## Median : 67.00 Median :0.7020 Median :0.1760
## Mean : 64.86 Mean :0.7122 Mean :0.2749
## 3rd Qu.: 81.00 3rd Qu.:0.8180 3rd Qu.:0.5030
## Max. :139.00 Max. :3.7140 Max. :0.9990
##
## Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1
## Min. :0.0000077
## 1st Qu.:0.0000990
## Median :0.0001815
## Mean :0.1599975
## 3rd Qu.:0.0003373
## Max. :0.9995800
##
## Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2
## Min. :0.00042
## 1st Qu.:0.99966
## Median :0.99982
## Mean :0.84000
## 3rd Qu.:0.99990
## Max. :0.99999
##
### I don't remember why we did Attrition_Flag 0-1??? Doesn't make sense now so I comment it out if you will need uncomment line 91 and tell me for what we are gonna use it.
# If customer left the bank 1 if stayed 0
# churnNN$Attrition_Flag <- ifelse(churnNN$Attrition_Flag=="Attrited Customer", 1, 0)
# dropping the columns which is not useful for our analysis
churnNN$CLIENTNUM <- NULL
churnNN$Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1 <- NULL
churnNN$Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2 <- NULL
summary(churnNN)
## Attrition_Flag Customer_Age Gender Dependent_count
## Attrited Customer:1113 Min. :26.00 F:3375 Min. :0.000
## Existing Customer:5968 1st Qu.:41.00 M:3706 1st Qu.:1.000
## Median :46.00 Median :2.000
## Mean :46.35 Mean :2.338
## 3rd Qu.:52.00 3rd Qu.:3.000
## Max. :73.00 Max. :5.000
## Education_Level Marital_Status Income_Category Card_Category
## College : 844 Divorced: 569 $120K + : 572 Blue :6598
## Doctorate : 358 Married :3564 $40K - $60K :1412 Gold : 81
## Graduate :2591 Single :2948 $60K - $80K :1103 Platinum: 11
## High School :1653 $80K - $120K :1202 Silver : 391
## Post-Graduate: 431 Less than $40K:2792
## Uneducated :1204
## Months_on_book Total_Relationship_Count Months_Inactive_12_mon
## Min. :13.00 Min. :1.000 Min. :0.000
## 1st Qu.:31.00 1st Qu.:3.000 1st Qu.:2.000
## Median :36.00 Median :4.000 Median :2.000
## Mean :35.98 Mean :3.819 Mean :2.343
## 3rd Qu.:40.00 3rd Qu.:5.000 3rd Qu.:3.000
## Max. :56.00 Max. :6.000 Max. :6.000
## Contacts_Count_12_mon Credit_Limit Total_Revolving_Bal Avg_Open_To_Buy
## Min. :0.000 Min. : 1438 Min. : 0 Min. : 3
## 1st Qu.:2.000 1st Qu.: 2498 1st Qu.: 463 1st Qu.: 1248
## Median :2.000 Median : 4287 Median :1282 Median : 3250
## Mean :2.454 Mean : 8493 Mean :1168 Mean : 7325
## 3rd Qu.:3.000 3rd Qu.:10729 3rd Qu.:1781 3rd Qu.: 9491
## Max. :6.000 Max. :34516 Max. :2517 Max. :34516
## Total_Amt_Chng_Q4_Q1 Total_Trans_Amt Total_Trans_Ct Total_Ct_Chng_Q4_Q1
## Min. :0.0000 Min. : 510 Min. : 10.0 Min. :0.0000
## 1st Qu.:0.6290 1st Qu.: 2089 1st Qu.: 44.0 1st Qu.:0.5830
## Median :0.7350 Median : 3831 Median : 67.0 Median :0.7000
## Mean :0.7606 Mean : 4394 Mean : 64.5 Mean :0.7115
## 3rd Qu.:0.8580 3rd Qu.: 4740 3rd Qu.: 80.0 3rd Qu.:0.8180
## Max. :3.3970 Max. :17995 Max. :134.0 Max. :3.7140
## Avg_Utilization_Ratio
## Min. :0.0000
## 1st Qu.:0.0260
## Median :0.1860
## Mean :0.2823
## 3rd Qu.:0.5150
## Max. :0.9990
# Ordering factor from smaller to bigger in order to have it in correct order in plots
churnNN$Income_Category <- ordered(churnNN$Income_Category, levels = c("Less than $40K", "$40K - $60K", "$60K - $80K", "$80K - $120K", "$120K +"))
churnNN$Education_Level <- ordered(churnNN$Education_Level, levels = c("Uneducated", "High School", "College", "Graduate","Post-Graduate", "Doctorate"))
ggplot(churnNN, aes(x = Credit_Limit, fill = Income_Category)) +
geom_histogram(data = churnNN[,-8], fill = "grey", alpha = .5) +
geom_histogram(colour = "black") +
facet_wrap(~ Income_Category) +
guides(fill = FALSE)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(churnNN, aes(x = Total_Revolving_Bal, fill = Attrition_Flag)) +
geom_histogram(data = churnNN[,-1], fill = "grey", alpha = .5) +
geom_histogram(colour = "black") +
facet_wrap(~ Attrition_Flag) +
guides(fill = FALSE)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Explanation of this plot will be added…
my_comp <- list( c("Uneducated", "High School"), c("High School", "College"), c("College", "Graduate"), c("Graduate", "Post-Graduate"), c("Post-Graduate", "Doctorate") )
ggviolin(churnNN, x = "Education_Level", y = "Total_Revolving_Bal",
fill = "Education_Level", palette = "jco",
add = "boxplot", add.params = list(fill = "white")) +
stat_compare_means(method = 'anova') +
stat_compare_means(comparisons = my_comp)
library(heatmaply)
library(plotly)
library(ggcorrplot)
churn_numeric <- select_if(churnNN, is.numeric)
churn_ready_for_corr <- churn_numeric %>%
select(1:14)
# Compute correlation coefficients
corr <- churn_ready_for_corr %>%
cor()
# Compute correlation p-values
cor.test.p <- function(x){
FUN <- function(x, y) cor.test(x, y)[["p.value"]]
z <- outer(
colnames(x),
colnames(x),
Vectorize(function(i,j) FUN(x[,i], x[,j]))
)
dimnames(z) <- list(colnames(x), colnames(x))
z
}
p <- cor.test.p(churn_ready_for_corr)
# Create the heatmap
heatmaply_cor(
corr,
node_type = "scatter",
point_size_mat = -log10(p),
point_size_name = "-log10(p-value)",
label_names = c("x", "y", "Correlation")
)
Placeholder
# Clustering Variables
clusterData <- churnNN[, c("Credit_Limit", "Total_Revolving_Bal")]
clusters2 <- kmeans(clusterData, 3)
palette(c("#E41A1C", "#377EB8", "#4DAF4A",
"#984EA3", "#FF7F00", "#FFFF33",
"#A65628", "#F781BF", "#999999"))
par(mar = c(5.1, 4.1, 0, 1))
plot(clusterData,
col = clusters2$cluster,
pch = 20, cex = 3)
points(clusters2$centers, pch = 3, cex = 3, lwd = 3)